######################################################################
### Fig 1: Within-industry Variance in Applied Tariff Rates
######################################################################
rm(list=ls())

## setting the working directory to the current folder
setwd(getwd())

Data <- read.csv("./mfn_us_89_11.csv")
Data$Year <- as.numeric(Data$Year)
## subset data up to 2008
data <- Data[which(Data$Year<=2008),]

MakeHS8 <- function(hs10) {
    hs.i <- as.character(hs10)
    hs.i <- gsub(" ", "", hs.i)
    digits <- length(strsplit(hs.i, split="")[[1]])
    if (digits == 10){
        hs8 <- substring(hs.i, 1,8)
    } else if (digits == 9){
        hs.i <- paste("0", hs.i, sep="")
        hs8 <- substring(hs.i, 1,8)
    } else if (digits == 8){
        hs8 <- hs.i
    } else if (digits == 7){
        hs8 <- paste("0", hs.i, sep="")
    }  else {
        print(hs.i)
        stop("HS code digit is less than 7")
    }
    return(hs8)
}

MakeHS6 <- function(hs8) {
    hs.i <- as.character(hs8)
    hs.i <- gsub(" ", "", hs.i)
    digits <- length(strsplit(hs.i, split="")[[1]])
    if (digits == 8){
        hs6 <- substring(hs.i, 1,6)
    } else {
        print(hs.i)
        stop("HS code digit is not 8")
    }
    return(hs6)
}


MakeHS4 <- function(hs8) {
    hs.i <- as.character(hs8)
    hs.i <- gsub(" ", "", hs.i)
    digits <- length(strsplit(hs.i, split="")[[1]])
    if (digits == 8){
        hs4 <- substring(hs.i, 1,4)
    } else {
        print(hs.i)
        stop("HS code digit is not 8")
    }
    return(hs4)
}

MakeHS2 <- function(hs8) {
    hs.i <- as.character(hs8)
    hs.i <- gsub(" ", "", hs.i)
    digits <- length(strsplit(hs.i, split="")[[1]])
    if (digits == 8){
        hs2 <- substring(hs.i, 1,2)
    } else {
        print(hs.i)
        stop("HS code digit is not 8")
    }
    return(hs2)
}

data$hs8 <- sapply(data$ProductCode, MakeHS8)
data$hs6 <- sapply(data$hs8, MakeHS6)
data$hs4 <- sapply(data$hs8, MakeHS4)
data$hs2 <- sapply(data$hs8, MakeHS2)

missing <- which(is.na(data$AdValorem))
if (length(missing)>0){
    data <- data[-missing,]
}

## unique hs2
uniq.hs2 <- unique(data$hs2)
rm <- which(is.na(uniq.hs2))
rm <- c(rm,which(uniq.hs2==""))
if (length(rm)>0){
    uniq.hs2 <- uniq.hs2[-rm]
}


years <- sort(unique(data$Year))

var.total <- rep(NA, length(years))
var.within <- rep(NA, length(years))
var.between <- rep(NA, length(years))
nhs8 <- rep(NA, length(years))

mean.total <- rep(NA, length(years))
var.industry <- matrix(NA, nrow=length(uniq.hs2), ncol=length(years))
rownames(var.industry) <- as.character(uniq.hs2)
colnames(var.industry) <- as.character(years)


for (t in 1:length(years)){
    print(t)
    year.t <- years[t]
    ## subset data for year t
    sub.t <- subset(data, subset=(data$Year==year.t))
    
    quant <- quantile(sub.t$AdValorem, probs=seq(from=0,to=1,by=0.0001), na.rm=T)
    cutoff <- quant[10000] # 99.99
    
    ## remove NA, missing, outliers values
    rm <- which(is.na(sub.t$AdValorem))
    rm <- c(rm,which(sub.t$AdValorem==" "))
    rm <- unique(c(rm, which(sub.t$AdValorem>cutoff)))
    if (length(rm)>0){
        sub.t <- sub.t[-rm,]
    }

    ## total mean tau
    m.tau.t <- mean(sub.t$AdValorem, na.rm=T)
    mean.total[t] <- m.tau.t
    
    ## unique hs2 ------------------------------------------------
    uniq.hs2 <- unique(sub.t$hs2)
    rm <- which(is.na(uniq.hs2))
    rm <- c(rm,which(uniq.hs2==""))
    if (length(rm)>0){
        uniq.hs2 <- uniq.hs2[-rm]
    }
    
    tmp.1 <- 0 # total
    tmp.2 <- 0 # within
    tmp.3 <- 0 # beween
    
    for (j in 1:length(uniq.hs2)){
        ## print(j)
        hs.j <- uniq.hs2[j]
        ## subset data for industry j
        sub.j <- subset(sub.t, subset=(sub.t$hs2==hs.j))
        m.tau.j <- mean(sub.j$AdValorem, na.rm=T)

        tmp.ij1 <- 0
        tmp.ij2 <- 0

        if (nrow(sub.j)>0){
            
            for (i in 1:nrow(sub.j)){
                tau.ijt <- sub.j$AdValorem[i]
                ## consider if not NA

                if (!is.na(tau.ijt)){
                    tmp.ij1 <- tmp.ij1 + (tau.ijt-m.tau.t)^2
                    tmp.ij2 <- tmp.ij2 + (tau.ijt-m.tau.j)^2
                } else {
                    tmp.ij1 <- tmp.ij1
                    tmp.ij2 <- tmp.ij2
                }
            }
            
            tmp.1 <- tmp.1 + tmp.ij1
            tmp.2 <- tmp.2 + tmp.ij2
            tmp.3 <- tmp.3 + nrow(sub.j)*((m.tau.j - m.tau.t)^2)
            
        } else {
            tmp.1 <- tmp.1
            tmp.2 <- tmp.2
            tmp.3 <- tmp.3        
        }
        
    }
    
    var.total[t] <- (1/nrow(sub.t))*tmp.1 # 52.44892
    var.within[t] <- (1/nrow(sub.t))*tmp.2
    var.between[t] <- (1/nrow(sub.t))*tmp.3
    
    nhs8[t] <- nrow(sub.t)

    
}


HS2Var <- cbind(var.between, var.within, var.total, var.within/var.total)


pdf(file="./Figure1.pdf",
    width=12, height=8)

par(mar=c(5,6,2,2), cex.axis=1.3)
plot(years, var.total, pch=17, col="black", ylim=c(0,220), type="b",
     ylab="", xlab="", xaxt='n',
     xlim=c(1989,2009))
points(years, var.within, pch=19, col="red", type="b")
points(years, var.between, pch=21, col="blue", type="b")

text(2004, 180, "Total Variance", cex=2)
text(2002, 110, "Within Industry Variance", cex=2, col="red")
text(2002, 50, "Between Industry Variance", cex=2, col="blue")



text(1991.5, 150, "Uruguay Round", cex=1.7, col="gray20")
arrows(1988.5, 135, 1994.5, 135 , length = 0.25, angle = 30,
       code = 2, cex=1.1, col="gray50")


mtext(side=2, "Variance in Ad-valorem Tariff Rates", line=3, cex=2.5)


axis(1, at=c(1989,1991,1993,1995,1997,1999,2001,2003,2005,2007,2009,2011),
     labels=c("1989","1991","1993","1995","1997","1999","2001",
         "2003","2005","2007","2009", "2011"))


rect(1988,0, 1995,250,col = rgb(0.5,0.5,0.5,1/6),
     border=NA)#xleft,ybottom, xtop, ytop


dev.off()

